In this notebook I identify differentially methylated genes (DMGs) between two Olympia oyster populations, Hood Canal and South Sound. First I prepare my data to be in the correct format / shape, then test for DMGs using a binomial GLM. The genes are also annotated using a gene feature file and BEDtools, and biological functions associated with GO terms are visualized with REVIGO.

Load libraries

list.of.packages <- c("reshape2","dplyr", "tidyr", "readr", "stringr", "plotly") #add new libraries here 
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)

# Load all libraries 
lapply(list.of.packages, FUN = function(X) {
  do.call("require", list(X)) 
})

Load filtered methylation data object which was created in the notebook “01-methylkit”

load(here::here("analyses", "meth_filter")) 

Reshape data to create a new column with sample info, and calculate %methylation for each locus/sample

meth_filter_reshaped <- melt(data=meth_filter, id=c("chr", "start", "end", "strand"), value.name = "count") %>%
 tidyr::separate(col = "variable" , into = c("variable", "sample"), sep = "(?<=[a-zA-Z])\\s*(?=[0-9])") %>%
  dcast(chr+start+end+strand+sample ~  variable) %>%
  mutate(percMeth = 100*(numCs/coverage))
readr::write_delim(meth_filter_reshaped, "../analyses/meth_filter.tab",  delim = '\t', col_names = FALSE)

Use intersectBed to find where loci and genes intersect, allowing loci to be mapped to annotated genes

wb: Print all lines in the second file a: input data file, which was created in previous code chunk b: save annotated gene list

intersectBed \
  -wb \
  -a "../analyses/meth_filter.tab" \
  -b "../genome-features/Olurida_v081-20190709.gene.gff" \
  > "../analyses/meth_filter_gene.tab"

Here is the number of loci associated with gene regions:

wc -l "../analyses/meth_filter_gene.tab"
##    69984 ../analyses/meth_filter_gene.tab

Binomial GLMs to test for differentially methylated genes

Add columns for organization / filtering:

– gene = contig number + start/end locus – group = sample number + gene – population = HC for Hood Canal, or SS for South Sound

library(car)
meth_filter_genes <- 
  read_delim(file = here::here("analyses", "meth_filter_gene.tab"), delim = "\t", col_names = c(colnames(meth_filter_reshaped), "contig_gene", "source_gene", "feature_gene", "start_gene", "end_gene", "unknown1_gene", "strand_gene", "unknown2_gene", "notes_gene")) %>%
  mutate(gene=paste(contig_gene, start_gene, end_gene, sep="_")) %>%
  mutate(group=paste(sample, gene, sep="_")) %>%
  mutate(population=as.character(sample))
## Parsed with column specification:
## cols(
##   chr = col_character(),
##   start = col_double(),
##   end = col_double(),
##   strand = col_character(),
##   sample = col_double(),
##   coverage = col_double(),
##   numCs = col_double(),
##   numTs = col_double(),
##   percMeth = col_double(),
##   contig_gene = col_character(),
##   source_gene = col_character(),
##   feature_gene = col_character(),
##   start_gene = col_double(),
##   end_gene = col_double(),
##   unknown1_gene = col_character(),
##   strand_gene = col_character(),
##   unknown2_gene = col_character(),
##   notes_gene = col_character()
## )
meth_filter_genes$population <- car::recode(meth_filter_genes$population, "c('1', '2', '3', '4', '5', '6', '7', '8', '9')='HC'") 
meth_filter_genes$population <- car::recode(meth_filter_genes$population, "c('10','11','12','13','14','15','16','17','18')='SS'")

Here is the number of genes that are represented in our methylation data set:

length(unique(meth_filter_genes$gene)) 
## [1] 1379

Filter for genes with at minimum 5 methylated loci

min.filt <- count(meth_filter_genes, vars = c( group))
newdata <- min.filt[ which(min.filt$n > 4), ]
sub_meth_table <- meth_filter_genes[meth_filter_genes$group %in% newdata$vars,]

Here is the number of genes that remain after filtering for those with 5 or more methylated loci within each gene:

length(unique(sub_meth_table$gene))  
## [1] 204

Run GLM to test for differences among population for each gene individually

Note: this script was created by Hollie Putnam (GM.Rmd); there are minor revisions below. I retained some commented out lines (notably-testing for position w/n gene, such as intron & exon) in case we want to include those as factors in the future.

# create data frame to stored results
results <- data.frame()

gs <- unique(sub_meth_table$gene)

#first subset the unique dataframes and second run the GLMs
for(i in 1:length(gs)){
  
  #subset the dataframe gene by gene
  sub_meth_table1 <- subset(sub_meth_table, gene ==gs[i])
  
  # fit glm position model
  fit <- glm(matrix(c(numCs, numTs), ncol=2) ~ as.factor(population), 
             data=sub_meth_table1, family=binomial)
  a <- anova(fit, test="Chisq")
  
  # capture summary stats to data frame
  df <- data.frame(sub_meth_table1[c("population", "sample", "contig_gene", "start_gene", "end_gene", "gene", "chr", "start", "sample", "coverage", "numCs", "numTs", "percMeth", "notes_gene")],
                   pval.treatment = a$`Pr(>Chi)`[2],
                   #pval.position = a$`Pr(>Chi)`[3], #uncomment if you want to include position of CpG within a gene
                   #pval.treatment_x_position = a$`Pr(>Chi)`[4], #uncomment if you want to include position of CpG within a gene interaction with treatment
                   stringsAsFactors = F)
  
  # bind rows of temporary data frame to the results data frame
  results <- rbind(results, df)
  
}
results[is.na(results)] <- 0
results$adj.pval.pop <- p.adjust(results$pval.treatment, method='BH')
#results$adj.pval.position <- p.adjust(results$pval.position, method='BH') #uncomment if you want to include position of CpG within a gene
#results$adj.pval.treatment_x_position <- p.adjust(results$pval.treatment_x_position, method='BH') #uncomment if you want to include position of CpG within a gene interaction with treatment

Extract only genes that were differentially methylated (p-adj < 0.05),

DMGs <- subset(results, adj.pval.pop < 0.05) #safe df with differentially methylated genes 
DMGs.genes <- DMGs[!duplicated(DMGs$gene), c("contig_gene", "start_gene", "end_gene", "notes_gene", "pval.treatment", "adj.pval.pop")]

Here is the number of differentially methylated genes:

length(unique(DMGs$gene))  
## [1] 46

Extract GO terms for DMGs and save to file

# split gene data in "notes_gene" column into separate columns 
DMGs.genes <- DMGs.genes %>%
  mutate(ID=str_extract(notes_gene, "ID=(.*?);"),
       Parent=str_extract(notes_gene, "Parent=(.*?);"),
       Name=str_extract(notes_gene, "Name=(.*?);"),
       Alias=str_extract(notes_gene, "Alias=(.*?);"),
       AED=str_extract(notes_gene, "AED=(.*?);"),
       eAED=str_extract(notes_gene, "eAED=(.*?);"),
       Note=str_extract(notes_gene, "Note=(.*?);"),
       Ontology_term=str_extract(notes_gene, "Ontology_term=(.*?);"),
       Dbxref=str_extract(notes_gene, "Dbxref=(.*?);")
       )

#Extract GO terms 
DMGs.genes.GO <- DMGs.genes %>%
  mutate(Ontology_term = str_replace(Ontology_term, pattern="Ontology_term=",replacement = "")) %>%
  mutate(Ontology_term = str_replace(Ontology_term, pattern=";",replacement = "")) %>%
  separate(Ontology_term, sep=",", into=paste("GO", 1:11, sep="_")) %>%
  pivot_longer(cols=c("GO_1","GO_2","GO_3","GO_4","GO_5","GO_6","GO_7","GO_8","GO_9","GO_10","GO_11"), names_to = "GO_number", values_to = "GO_term") %>%
  dplyr::select(-GO_number) %>%
  filter(!is.na(Note) & !is.na(GO_term))
## Warning: Expected 11 pieces. Missing pieces filled with `NA` in 17 rows [5, 7,
## 8, 9, 13, 14, 15, 16, 18, 22, 26, 33, 38, 42, 44, 45, 46].
write_delim(DMGs.genes.GO[,c("GO_term","adj.pval.pop")], path = here::here("analyses/", "DMRs.GO.txt"), delim = '\t', col_names = F) #write out df with just GO terms and p-adj values 

The file “../analyses/DMRs.GO.txt” was opened outside of RMarkdown and the GO terms and p-values were pasted into REVIGO. I then exported the R script to generate the Biological Processes scatterplot. Here is the REVIGO table:

term ID description frequency pin? log10 p-value uniqueness dispensability
GO:0006468 protein phosphorylation 4.137 % -3.7877 0.40 0.00
GO:0006807 nitrogen compound metabolic process 38.744 % -2.2764 0.78 0.03
GO:0006207 ‘de novo’ pyrimidine nucleobase biosynthetic process 0.192 % -2.2764 0.46 0.06
GO:0006281 DNA repair 2.234 % -2.4853 0.50 0.20
GO:0006030 chitin metabolic process 0.077 % -1.6311 0.49 0.21
GO:0006520 cellular amino acid metabolic process 5.591 % -2.2764 0.42 0.35
GO:0006412 translation 5.686 % -2.4853 0.28 0.55
GO:0016567 protein ubiquitination 0.523 % -1.4336 0.44 0.56

Here is the REVIGO plot showing GO terms associated with DMGs

# A plotting R script produced by the REVIGO server at http://revigo.irb.hr/
# If you found REVIGO useful in your work, please cite the following reference:
# Supek F et al. "REVIGO summarizes and visualizes long lists of Gene Ontology
# terms" PLoS ONE 2011. doi:10.1371/journal.pone.0021800


# --------------------------------------------------------------------------
# If you don't have the ggplot2 package installed, uncomment the following line:
# install.packages( "ggplot2" );
library( ggplot2 );
# --------------------------------------------------------------------------
# If you don't have the scales package installed, uncomment the following line:
# install.packages( "scales" );
library( scales );
## Warning: package 'scales' was built under R version 3.5.2
## 
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
## 
##     col_factor
# --------------------------------------------------------------------------
# Here is your data from REVIGO. Scroll down for plot configuration options.

revigo.names <- c("term_ID","description","frequency_%","plot_X","plot_Y","plot_size","log10_p_value","uniqueness","dispensability");
revigo.data <- rbind(c("GO:0006468","protein phosphorylation", 4.137, 4.305,-3.360, 5.725,-3.7877,0.400,0.000),
c("GO:0006807","nitrogen compound\nmetabolic process",38.744, 0.501, 5.551, 6.696,-2.2764,0.783,0.034),
c("GO:0006207","'de novo' pyrimidine\nnucleobase biosynthetic process", 0.192,-3.965,-5.039, 4.392,-2.2764,0.459,0.063),
c("GO:0006281","DNA repair", 2.234, 1.011,-6.623, 5.457,-2.4853,0.500,0.201),
c("GO:0006030","chitin metabolic\nprocess", 0.077,-5.068,-0.868, 3.994,-1.6311,0.491,0.211),
c("GO:0006520","cellular amino acid\nmetabolic process", 5.591,-4.136,-3.302, 5.856,-2.2764,0.423,0.351),
c("GO:0006412","translation", 5.686,-0.187,-3.138, 5.863,-2.4853,0.280,0.551),
c("GO:0016567","protein ubiquitination", 0.523, 4.909,-2.319, 4.827,-1.4336,0.440,0.560));

one.data <- data.frame(revigo.data);
names(one.data) <- revigo.names;
one.data <- one.data [(one.data$plot_X != "null" & one.data$plot_Y != "null"), ];
one.data$plot_X <- as.numeric( as.character(one.data$plot_X) );
one.data$plot_Y <- as.numeric( as.character(one.data$plot_Y) );
one.data$plot_size <- as.numeric( as.character(one.data$plot_size) );
one.data$log10_p_value <- as.numeric( as.character(one.data$log10_p_value) );
one.data$frequency <- as.numeric( as.character(one.data$frequency) );
one.data$uniqueness <- as.numeric( as.character(one.data$uniqueness) );
one.data$dispensability <- as.numeric( as.character(one.data$dispensability) );
head(one.data);
##      term_ID                                           description frequency_%
## 1 GO:0006468                               protein phosphorylation       4.137
## 2 GO:0006807                  nitrogen compound\nmetabolic process      38.744
## 3 GO:0006207 'de novo' pyrimidine\nnucleobase biosynthetic process       0.192
## 4 GO:0006281                                            DNA repair       2.234
## 5 GO:0006030                             chitin metabolic\nprocess       0.077
## 6 GO:0006520                cellular amino acid\nmetabolic process       5.591
##   plot_X plot_Y plot_size log10_p_value uniqueness dispensability frequency
## 1  4.305 -3.360     5.725       -3.7877      0.400          0.000     4.137
## 2  0.501  5.551     6.696       -2.2764      0.783          0.034    38.744
## 3 -3.965 -5.039     4.392       -2.2764      0.459          0.063     0.192
## 4  1.011 -6.623     5.457       -2.4853      0.500          0.201     2.234
## 5 -5.068 -0.868     3.994       -1.6311      0.491          0.211     0.077
## 6 -4.136 -3.302     5.856       -2.2764      0.423          0.351     5.591
# --------------------------------------------------------------------------
# Names of the axes, sizes of the numbers and letters, names of the columns,
# etc. can be changed below

p1 <- ggplot( data = one.data );
p1 <- p1 + geom_point( aes( plot_X, plot_Y, colour = log10_p_value, size = plot_size), alpha = I(0.6) ) + scale_size_area();
p1 <- p1 + scale_colour_gradientn( colours = c("blue", "green", "yellow", "red"), limits = c( min(one.data$log10_p_value), 0) );
p1 <- p1 + geom_point( aes(plot_X, plot_Y, size = plot_size), shape = 21, fill = "transparent", colour = I (alpha ("black", 0.6) )) + scale_size_area();
## Scale for 'size' is already present. Adding another scale for 'size', which
## will replace the existing scale.
p1 <- p1 + scale_size( range=c(5, 30)) + theme_bw(); # + scale_fill_gradientn(colours = heat_hcl(7), limits = c(-300, 0) );
## Scale for 'size' is already present. Adding another scale for 'size', which
## will replace the existing scale.
#ex <- one.data [ one.data$dispensability < 0.15, ]; 
p1 <- p1 + geom_text( data = one.data, aes(plot_X, plot_Y, label = as.character(description)), colour = I(alpha("black", 0.85)), size = 3 );
p1 <- p1 + labs (y = "semantic space x", x = "semantic space y");
p1 <- p1 + theme(legend.key = element_blank()) ;
one.x_range = max(one.data$plot_X) - min(one.data$plot_X);
one.y_range = max(one.data$plot_Y) - min(one.data$plot_Y);
p1 <- p1 + xlim(min(one.data$plot_X)-one.x_range/10,max(one.data$plot_X)+one.x_range/10);
p1 <- p1 + ylim(min(one.data$plot_Y)-one.y_range/10,max(one.data$plot_Y)+one.y_range/10);

# --------------------------------------------------------------------------
# Output the plot to screen

ggplotly(p1);
# Uncomment the line below to also save the plot to a file.
# The file type depends on the extension (default=pdf).

# ggsave("C:/Users/path_to_your_file/revigo-plot.pdf");